Adam Gracy


# Set the knitr chunk option `echo` to TRUE as the default for all chunks.
knitr::opts_chunk$set(echo = TRUE)

Introduction to the data

Data and variables (see MS pp. 77, 125)

Because quality assurance for software is costly in both money and time, developers and engineers tend to focus their effort on certain parts of the code. This leaves "blind spots": areas that were considered non-critical and were therefore overlooked.

The data set we examine in this project is called SWDEFECTS. It contains a number of "modules" of code written in the C programming language. For each module, the researchers evaluated the code line by line and classified the module as "true" if defects were found and "false" otherwise.

Several methods for detecting defects were used. For each detection method, the SWDEFECTS data set also contains a variable recording whether that method predicted a defect in the module ("yes") or not ("no").

The researchers have evaluated the detection algorithms using different probability measures called accuracy, detection rate, false alarm rate, and precision.

The purpose of this paper is to explore these measures.

Summary Table

Create the summary table (TABLE SIA3.2) found on page 125.

# Attach pander with library() so that a missing package stops the script here,
# instead of require() returning FALSE and a later pander call failing.
library(pander)
# Table width at which pander splits a table is set to Inf.
panderOptions('table.split.table', Inf)
set.caption("Table SIA3.2")
# Pipe-delimited text layout of the 2x2 summary table (cells a, b, c, d).
# The \n inside the string starts a new input line for the second header row.
my.data <- "Summary Table for Evaluating Defect Prediction Algorithms
                | ***Module   has  ***|***defects***\n|***False*** | ***True***

  ***Algorithm Predicts***&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;no     | a      |   b 
  &nbsp;&nbsp;***defects***&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; yes| c      |    d"
# Parse the text into a data frame, splitting fields on "|" and trimming
# surrounding white space from each field.
df <- read.delim(textConnection(my.data),header=FALSE,sep="|",strip.white=TRUE,stringsAsFactors=FALSE)
names(df) <- unname(as.list(df[1,])) # put headers on
df <- df[-1,] # remove first row
row.names(df)<-NULL
# Render the data frame as an R Markdown style table.
pander(df, style = 'rmarkdown')

Using $\LaTeX$ construct the formulae for

  1. Accuracy: $P(Algorithm\space is\space correct) = \frac{(a+d)}{(a+b+c+d)}$

  2. Detection rate: $P(predict \space defect | module\space has\space defect) = \frac{d}{(b+d)}$

  3. False alarm rate: $P(predict\space defect | module\space has\space no\space defect ) = \frac{c}{(a+c)}$

  4. Precision: $P(module\space has\space defect | predict \space defect) = \frac{d}{(c+d)}$

R functions

Using the above definitions, write R functions that compute the required probabilities. Please remove eval=FALSE when creating the functions:

# Accuracy: P(algorithm is correct) = (a + d) / (a + b + c + d).
#
# Arguments are the cell counts of the 2x2 summary table:
#   a: algorithm predicts "no",  module has no defect
#   b: algorithm predicts "no",  module has a defect
#   c: algorithm predicts "yes", module has no defect
#   d: algorithm predicts "yes", module has a defect
# Returns the ratio as a numeric value. The ratio is the final expression
# (not an assignment), so the result is returned visibly and computed once.
acc <- function(a, b, c, d) {
  (a + d) / (a + b + c + d)
}
# Detection rate: P(predict defect | module has defect) = d / (b + d).
#
#   b: count of modules with a defect that the algorithm predicted "no" for
#   d: count of modules with a defect that the algorithm predicted "yes" for
# Returns the ratio as a numeric value. The ratio is the final expression
# (not an assignment), so the result is returned visibly and computed once.
detect <- function(b, d) {
  d / (b + d)
}
# False alarm rate: P(predict defect | module has no defect) = c / (a + c).
#
#   a: count of defect-free modules that the algorithm predicted "no" for
#   c: count of defect-free modules that the algorithm predicted "yes" for
# Returns the ratio as a numeric value. The ratio is the final expression
# (not an assignment), so the result is returned visibly and computed once.
falarm <- function(a, c) {
  c / (a + c)
}
# Precision: P(module has defect | predict defect) = d / (c + d).
#
#   c: count of modules predicted "yes" that have no defect
#   d: count of modules predicted "yes" that do have a defect
# Returns the ratio as a numeric value. The ratio is the final expression
# (not an assignment), so the result is returned visibly and computed once.
prec <- function(c, d) {
  d / (c + d)
}

Create the tables in Figure SIA3.1

PRED_LOC * DEFECT

# Read the software-defects data from swdefects.csv.
swd <- read.csv("swdefects.csv")

# Cross-tabulate the predict.loc.50 prediction against the defect column.
tab <- with(swd, table(predict.loc.50, defect))
# legend.text is written out in full instead of relying on partial
# matching of the abbreviated argument name `leg`.
barplot(tab, beside = TRUE, legend.text = TRUE)
# Append the row and column totals and print the resulting table.
tab2 <- addmargins(tab)
tab2

PRED_VG * DEFECT

# Cross-tabulate the predict.vg.10 prediction against the defect column.
tab <- with(swd, table(predict.vg.10, defect))
# legend.text is written out in full instead of relying on partial
# matching of the abbreviated argument name `leg`.
barplot(tab, beside = TRUE, legend.text = TRUE)
# Append the row and column totals and print the resulting table.
tab2 <- addmargins(tab)
tab2

PRED_EVG * DEFECT

# Cross-tabulate the predict.evg.14.5 prediction against the defect column.
tab <- with(swd, table(predict.evg.14.5, defect))
# legend.text is written out in full instead of relying on partial
# matching of the abbreviated argument name `leg`.
barplot(tab, beside = TRUE, legend.text = TRUE)
# Append the row and column totals and print the resulting table.
tab2 <- addmargins(tab)
tab2

PRED_IVG * DEFECT

# Cross-tabulate the predict.ivg.9.2 prediction against the defect column.
tab <- with(swd, table(predict.ivg.9.2, defect))
# legend.text is written out in full instead of relying on partial
# matching of the abbreviated argument name `leg`.
barplot(tab, beside = TRUE, legend.text = TRUE)
# Append the row and column totals and print the resulting table.
tab2 <- addmargins(tab)
tab2

Create the corrected table

library(dplyr)

# For every prediction method the four cell counts follow the summary table:
#   a = predicts "no",  defect FALSE     b = predicts "no",  defect TRUE
#   c = predicts "yes", defect FALSE     d = predicts "yes", defect TRUE
# The measures are assigned with a plain `<-`; the previous `<-- -1 * x` form
# parsed as a double negation and evaluated to the same value x.

# Row 1: lines of code (LOC)
loc_a <- as.numeric(nrow(filter(swd, defect == "FALSE" & predict.loc.50 == "no")))
loc_b <- as.numeric(nrow(filter(swd, defect == "TRUE" & predict.loc.50 == "no")))
loc_c <- as.numeric(nrow(filter(swd, defect == "FALSE" & predict.loc.50 == "yes")))
loc_d <- as.numeric(nrow(filter(swd, defect == "TRUE" & predict.loc.50 == "yes")))

loc_acc <- acc(loc_a, loc_b, loc_c, loc_d)
loc_detect <- detect(loc_b, loc_d)
loc_falarm <- falarm(loc_a, loc_c)
loc_prec <- prec(loc_c, loc_d)

# Row 2: cyclomatic complexity
vg_a <- as.numeric(nrow(filter(swd, defect == "FALSE" & predict.vg.10 == "no")))
vg_b <- as.numeric(nrow(filter(swd, defect == "TRUE" & predict.vg.10 == "no")))
vg_c <- as.numeric(nrow(filter(swd, defect == "FALSE" & predict.vg.10 == "yes")))
vg_d <- as.numeric(nrow(filter(swd, defect == "TRUE" & predict.vg.10 == "yes")))

vg_acc <- acc(vg_a, vg_b, vg_c, vg_d)
vg_detect <- detect(vg_b, vg_d)
vg_falarm <- falarm(vg_a, vg_c)
vg_prec <- prec(vg_c, vg_d)

# Row 3: essential complexity
evg_a <- as.numeric(nrow(filter(swd, defect == "FALSE" & predict.evg.14.5 == "no")))
evg_b <- as.numeric(nrow(filter(swd, defect == "TRUE" & predict.evg.14.5 == "no")))
evg_c <- as.numeric(nrow(filter(swd, defect == "FALSE" & predict.evg.14.5 == "yes")))
evg_d <- as.numeric(nrow(filter(swd, defect == "TRUE" & predict.evg.14.5 == "yes")))

evg_acc <- acc(evg_a, evg_b, evg_c, evg_d)
evg_detect <- detect(evg_b, evg_d)
evg_falarm <- falarm(evg_a, evg_c)
evg_prec <- prec(evg_c, evg_d)

# Row 4: design complexity
ivg_a <- as.numeric(nrow(filter(swd, defect == "FALSE" & predict.ivg.9.2 == "no")))
ivg_b <- as.numeric(nrow(filter(swd, defect == "TRUE" & predict.ivg.9.2 == "no")))
ivg_c <- as.numeric(nrow(filter(swd, defect == "FALSE" & predict.ivg.9.2 == "yes")))
ivg_d <- as.numeric(nrow(filter(swd, defect == "TRUE" & predict.ivg.9.2 == "yes")))

ivg_acc <- acc(ivg_a, ivg_b, ivg_c, ivg_d)
ivg_detect <- detect(ivg_b, ivg_d)
ivg_falarm <- falarm(ivg_a, ivg_c)
ivg_prec <- prec(ivg_c, ivg_d)

# Make the table: one row of measures per method.
loc_row <- c(loc_acc, loc_detect, loc_falarm, loc_prec)
vg_row <- c(vg_acc, vg_detect, vg_falarm, vg_prec)
evg_row <- c(evg_acc, evg_detect, evg_falarm, evg_prec)
# The design-complexity row uses its own false alarm rate and precision
# (it previously picked up the essential-complexity values by mistake).
ivg_row <- c(ivg_acc, ivg_detect, ivg_falarm, ivg_prec)
# Concatenate the rows in table order; matrix() fills row by row below.
d <- c(loc_row, vg_row, evg_row, ivg_row)
matrix3 <- matrix(d, nrow = 4, ncol = 4, byrow = TRUE)
rownames(matrix3) <- c("Lines of code", "Cyclomatic complexity", "Essential complexity", "Design complexity")
colnames(matrix3) <- c("Accuracy", "Detection Rate", "False Alarm Rate", "Precision")
tab3 <- as.table(matrix3)

tab3

Create the function

# Round a table of probabilities, draw it as a grouped bar chart, and
# return the rounded values.
#
#   tab3:           matrix or table of probabilities to plot
#   roundPrecision: number of decimal places passed to round() (default 4)
#
# The legend labels and the four bar colours are fixed inside the function,
# so the input is presumably expected to have four rows, one per method.
# Returns a one-element list holding the rounded table.
myBar <- function(tab3, roundPrecision = 4) {
  rounded <- round(tab3[, ], roundPrecision)

  method_labels <- c(
    "Lines of Code",
    "Cyclomatic complexity",
    "Essential complexity",
    "Design complexity"
  )
  legend_opts <- list(x = "topright", title = "Method")

  # beside = TRUE draws the bars side by side instead of stacking them.
  barplot(
    rounded,
    beside = TRUE,
    col = 1:4,
    ylim = c(0, 1),
    xlab = "Algorithm",
    ylab = "Probability",
    legend.text = method_labels,
    args.legend = legend_opts
  )

  list(rounded)
}

Call the function with roundPrecision = 3

# Plot the corrected table with its values rounded to 3 decimal places.
myBar(tab3 = tab3, roundPrecision = 3)


agracy2246/MATH4753grac0009 documentation built on April 26, 2020, 9:39 a.m.